library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(limma)
library(psych)
library(tidyverse)
library(CONSTANd) # install from source: https://github.com/PDiracDelta/CONSTANd/
source('other_functions.R')
source('plotting_functions.R')
data.list <- readRDS(params$input_data_p)
dat.l <- data.list$dat.l # data in long format
dat.w <- data.list$dat.w # data in wide format
# which proteins were spiked in?
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character
tmp=dat.l %>% distinct(Protein) %>% pull %>% as.character
# protein subsampling
if (params$subsample_p>0 & params$subsample_p==floor(params$subsample_p) & params$subsample_p<=length(tmp)){
sub.prot <- tmp[sample(1:length(tmp), size=params$subsample_p)]
if (length(spiked.proteins)>0) sub.prot <- c(sub.prot,spiked.proteins)
dat.l <- dat.l %>% filter(Protein %in% sub.prot)
dat.w <- dat.w %>% filter(Protein %in% sub.prot)
}
# specify # of varying component variants and their names
variant.names <- c('log2_intensity', 'intensity', 'ratio_rowmean','ratio_condmean','ratio_condpairs','ratio_onetag')
n.comp.variants <- length(variant.names)
scale.vec <- c('log', 'raw', 'raw','raw','raw','raw') # ratios are considered raw, because they are basically mean-normalized intensities
# get some data parameters created in the data_prep script
referenceCondition <- data.list$data.params$referenceCondition
channelsOrdered <- data.list$data.params$channelsOrdered
condition.color <- data.list$data.params$condition.color
ma.onesample.num <- data.list$data.params$ma.onesample.num
ma.onesample.denom <- data.list$data.params$ma.onesample.denom
ma.allsamples.num <- data.list$data.params$ma.allsamples.num
ma.allsamples.denom <- data.list$data.params$ma.allsamples.denom
# create data frame with sample information
sample.info <- get_sample_info(dat.l, condition.color)
# get channel names
channelNames <- remove_factors(unique(sample.info$Channel))
dat.unit.l <- emptyList(variant.names)
dat.unit.l$log2_intensity <- dat.l %>% mutate(response=log2(intensity)) %>% select(-intensity)
dat.unit.l$intensity <- dat.l %>% rename(response=intensity)
ratio_rowmean: Calculate ratio (per PSM) with respect to average intensity within run, or in other words: each value is divided by the row mean.
# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% distinct(Channel) %>% pull(Channel)
denom.df=dat.l %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$ratio_rowmean <- dat.l %>% left_join(denom.df, by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=intensity/denom) %>% select(-c(intensity, denom))
ratio_condmean: each sample is divided by the average of samples belonging to the reference condition (here ‘r referenceCondition’)
# use half-wide data to compute within-run average of PSM channels corresponding to the reference Condition
refCols <- sample.info %>% filter(Condition==referenceCondition) %>% distinct(Channel) %>% pull
denom.df=dat.l %>% filter(Condition==referenceCondition) %>% pivot_wider(id_cols=-one_of('Condition', 'BioReplicate'),names_from='Channel', values_from='intensity')
denom.df$denom=apply(denom.df[,refCols], 1, function(x) mean(x, na.rm=T))
denom.df=denom.df[,c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'denom')]
dat.unit.l$ratio_condmean <- dat.l %>% left_join(denom.df, by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM')) %>% mutate(response=intensity/denom) %>% select(-c(intensity, denom))
ratio_condpairs: compute ratios using samples from a selected condition as ratio denominator (channels paired from left to right)
dat.unit.l$ratio_condpairs <- compute_ratio(dat.w, sample.info, channelsOrdered, referenceCondition)
ratio_onetag: divide by one channel reserved for the reference condition (here the first channel of 0.5)
refCols <- sample.info %>% filter(Condition %in% referenceCondition) %>% group_by(Run) %>% top_n(1, Channel) %>% pull(Sample) %>% as.character()
left_data=dat.unit.l$intensity %>% filter(paste(Run, Channel, sep=':') %in% refCols) %>% select('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM', 'response') %>% rename(denom=response)
dat.unit.l$ratio_onetag=left_join(dat.unit.l$intensity, left_data, by=c('Run', 'Protein', 'Peptide', 'RT', 'Charge', 'PTM'))
dat.unit.l$ratio_onetag <- dat.unit.l$ratio_onetag %>% mutate(response=response/denom) %>% select(-denom)
# switch to wide format
dat.unit.w <- lapply(dat.unit.l, function(x){
pivot_wider(data = x, id_cols=-one_of(c('Condition', 'BioReplicate')), names_from=Channel, values_from=response)})
# subtract the spectrum median log2intensity from the observed log2intensities
# subtract the spectrum median log2intensity from the observed log2intensities
dat.norm.w <- dat.unit.w
dat.norm.w$log2_intensity[,channelNames] <- median_sweep(dat.norm.w$log2_intensity[,channelNames], 1, '-')
dat.norm.w$intensity[,channelNames] <- median_sweep(dat.norm.w$intensity[,channelNames], 1, '/')
# dat.norm.w$ratio_rowmean[,channelNames] <- median_sweep(dat.norm.w$ratio_rowmean[,channelNames], 1, '/')
# dat.norm.w$ratio_condmean[,channelNames] <- median_sweep(dat.norm.w$ratio_condmean[,channelNames], 1, '/')
# dat.norm.w$ratio_condpairs[,channelNames] <- median_sweep(dat.norm.w$ratio_condpairs[,channelNames], 1, '/')
# dat.norm.w$ratio_onetag[,channelNames] <- median_sweep(dat.norm.w$ratio_onetag[,channelNames], 1, '/')
Summarize quantification values from PSM to peptide (first step) to protein (second step).
# normalized data
dat.norm.summ.w <- lapply(dat.norm.w, function(x){
y <- x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup()
return(y)})
Let’s also summarize the non-normalized data for comparison in the next section.
# non-normalized data
# group by (run,)protein,peptide then summarize twice (once on each level)
# add select() statement because summarise_at is going bananas over character columns
dat.nonnorm.summ.w <- lapply(dat.unit.w, function(x) {
y <- x %>% group_by(Run, Protein, Peptide) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% summarise_at(.vars = channelNames, .funs = median, na.rm=T) %>% ungroup()
return(y)})
# medianSweeping: in each channel, subtract/divide median computed across all proteins within the channel
# do the above separately for each MS run
second_median_sweep <- function(dat, fun){
dat.split <- split(dat, dat$Run)
dat.split.norm <- lapply(dat.split, function(y) {
y[,channelNames] <- median_sweep(y[,channelNames], 2, fun); return(y)})
return(bind_rows(dat.split.norm))
}
dat.norm.summ.w$log2_intensity <- second_median_sweep(dat.norm.summ.w$log2_intensity, '-')
dat.norm.summ.w$intensity <- second_median_sweep(dat.norm.summ.w$intensity, '/')
# dat.norm.summ.w$ratio_rowmean <- second_median_sweep(dat.norm.summ.w$ratio_rowmean, '/')
# dat.norm.summ.w$ratio_condmean <- second_median_sweep(dat.norm.summ.w$ratio_condmean, '/')
# dat.norm.summ.w$ratio_condpairs <- second_median_sweep(dat.norm.summ.w$ratio_condpairs, '/')
# dat.norm.summ.w$ratio_onetag <- second_median_sweep(dat.norm.summ.w$ratio_onetag, '/')
# make data completely wide (also across runs)
## normalized data
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
return( x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )
})
## non-normalized data
dat.nonnorm.summ.w2 <- lapply(dat.nonnorm.summ.w, function(x) {
return( x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}") )
})
# use (half-)wide format
par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
boxplot_w(dat.nonnorm.summ.w[[variant.names[i]]],sample.info, paste('raw', variant.names[i], sep='_'))
boxplot_w(dat.norm.summ.w[[variant.names[i]]], sample.info, paste('normalized', variant.names[i], sep='_'))}
MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).
for (i in 1:n.comp.variants){
p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], ma.onesample.num, ma.onesample.denom, scale=scale.vec[i], paste('raw', variant.names[i], sep='_'), spiked.proteins)
p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], ma.onesample.num, ma.onesample.denom, scale=scale.vec[i], paste('normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}
MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).
channels.num <- sample.info %>% filter(Condition==ma.allsamples.num) %>% select(Sample) %>% pull
channels.denom <- sample.info %>% filter(Condition==ma.allsamples.denom) %>% select(Sample) %>% pull
for (i in 1:n.comp.variants){
p1 <- maplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('raw', variant.names[i], sep='_'), spiked.proteins)
p2 <- maplot_ils(dat.norm.summ.w2[[variant.names[i]]], channels.num, channels.denom, scale=scale.vec[i], paste('normalized', variant.names[i], sep='_'), spiked.proteins)
grid.arrange(p1, p2, ncol=2)}
par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
if (variant.names[i]=='intensity') pca.scale=TRUE else pca.scale=FALSE
pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'), scale=pca.scale)
pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'))}
par(mfrow=c(1, 2))
for (i in 1:n.comp.variants){
if (variant.names[i]=='intensity') pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'), scale=T) else pcaplot_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('raw', variant.names[i], sep='_'))
pcaplot_ils(dat.norm.summ.w2[[variant.names[i]]] %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein'), info=sample.info, paste('normalized', variant.names[i], sep='_'))}
par(mfrow=c(1,2))
for (i in 1:n.comp.variants){
dendrogram_ils(dat.nonnorm.summ.w2[[variant.names[i]]] %>% select(-Protein), info=sample.info, paste('raw', variant.names[i], sep='_'))
dendrogram_ils(dat.norm.summ.w2[[variant.names[i]]] %>% select(-Protein), info=sample.info, paste('normalized', variant.names[i], sep='_'))}
Below the histograms of p-values from the linear model explaining the response variable with Run as a covariate.
plots <- vector('list', n.comp.variants)
for (i in 1:n.comp.variants){
dat <- list(dat.nonnorm.summ.l[[variant.names[i]]], dat.norm.summ.l[[variant.names[i]]])
names(dat) <- c(paste('raw', variant.names[i], sep='_'), paste('normalized', variant.names[i], sep='_'))
plots[[i]] <- run_effect_plot(dat)}
grid.arrange(grobs = plots, nrow=n.comp.variants)
# remove ref channels for DEA
# excl.sample1 <- sample.info %>% filter(Condition==ratioCondition) %>% pull(Sample) %>% as.character
# excl.sample2 <- sample.info %>% filter(stri_detect(Sample, fixed=refChannel)) %>% pull(Sample) %>% as.character
# dat.norm.summ.w2$ratio_condpairs <- dat.norm.summ.w2$ratio_condpairs %>% select(-all_of(excl.sample1))
# dat.norm.summ.w2$ratio_onetag <- dat.norm.summ.w2$ratio_onetag %>% select(-all_of(excl.sample2))
NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).
#{ INVESTIGATE late log2 transform
dat.norm.summ.w2$intensity_lateLog2 <- dat.norm.summ.w2$intensity
channelNames.prefixed <- colnames(dat.norm.summ.w2$intensity %>% select(-Protein))
dat.norm.summ.w2$intensity_lateLog2[,channelNames.prefixed] <- log2(dat.norm.summ.w2$intensity[,channelNames.prefixed])
variant.names <- names(dat.norm.summ.w2)
scale.vec <- c(scale.vec, 'log')
n.comp.variants <- n.comp.variants + 1
#}
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (i in 1:n.comp.variants) {
this_scale <- scale.vec[match(names(dat.dea)[i], variant.names)]
d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[variant.names[i]]]), 'Protein')
dat.dea[[variant.names[i]]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm, referenceCondition)
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4061 | 4 | 4062 | 6 | 4064 | 19 | 4064 | 19 | 4064 | 19 | 4064 | 19 | 4061 | 4 |
| DE | 3 | 15 | 2 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | 15 |
| log2_intensity | intensity | ratio_rowmean | ratio_condmean | ratio_condpairs | ratio_onetag | intensity_lateLog2 | |
|---|---|---|---|---|---|---|---|
| Accuracy | 0.998 | 0.998 | 0.995 | 0.995 | 0.995 | 0.995 | 0.998 |
| Sensitivity | 0.789 | 0.684 | 0.000 | 0.000 | 0.000 | 0.000 | 0.789 |
| Specificity | 0.999 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 0.999 |
| PPV | 0.833 | 0.867 | NaN | NaN | NaN | NaN | 0.833 |
| NPV | 0.999 | 0.999 | 0.995 | 0.995 | 0.995 | 0.995 | 0.999 |
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4064 | 18 | 4063 | 15 | 4064 | 19 | 4060 | 19 | 4064 | 19 | 4064 | 19 | 4064 | 18 |
| DE | 0 | 1 | 1 | 4 | 0 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| log2_intensity | intensity | ratio_rowmean | ratio_condmean | ratio_condpairs | ratio_onetag | intensity_lateLog2 | |
|---|---|---|---|---|---|---|---|
| Accuracy | 0.996 | 0.996 | 0.995 | 0.994 | 0.995 | 0.995 | 0.996 |
| Sensitivity | 0.053 | 0.211 | 0.000 | 0.000 | 0.000 | 0.000 | 0.053 |
| Specificity | 1.000 | 1.000 | 1.000 | 0.999 | 1.000 | 1.000 | 1.000 |
| PPV | 1.000 | 0.800 | NaN | 0.000 | NaN | NaN | 1.000 |
| NPV | 0.996 | 0.996 | 0.995 | 0.995 | 0.995 | 0.995 | 0.996 |
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4060 | 5 | 4058 | 3 | 4060 | 8 | 4060 | 10 | 4064 | 19 | 4064 | 19 | 4060 | 5 |
| DE | 4 | 14 | 6 | 16 | 4 | 11 | 4 | 9 | 0 | 0 | 0 | 0 | 4 | 14 |
| log2_intensity | intensity | ratio_rowmean | ratio_condmean | ratio_condpairs | ratio_onetag | intensity_lateLog2 | |
|---|---|---|---|---|---|---|---|
| Accuracy | 0.998 | 0.998 | 0.997 | 0.997 | 0.995 | 0.995 | 0.998 |
| Sensitivity | 0.737 | 0.842 | 0.579 | 0.474 | 0.000 | 0.000 | 0.737 |
| Specificity | 0.999 | 0.999 | 0.999 | 0.999 | 1.000 | 1.000 | 0.999 |
| PPV | 0.778 | 0.727 | 0.733 | 0.692 | NaN | NaN | 0.778 |
| NPV | 0.999 | 0.999 | 0.998 | 0.998 | 0.995 | 0.995 | 0.999 |
# character vectors containing logFC and p-values columns
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
significance.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)
scatterplot_ils(dat.dea, significance.cols, 'q-values', spiked.proteins, referenceCondition)
scatterplot_ils(dat.dea, logFC.cols, 'log2FC', spiked.proteins, referenceCondition)
for (i in 1:n.contrasts){
volcanoplot_ils(dat.dea, i, spiked.proteins, referenceCondition)}
Let’s see whether the spiked protein fold changes make sense
# plot theoretical value (horizontal lines) and violin per variant
violinplot_ils(lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols]), referenceCondition)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 18363)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United Kingdom.1252 LC_CTYPE=English_United Kingdom.1252
## [3] LC_MONETARY=English_United Kingdom.1252 LC_NUMERIC=C
## [5] LC_TIME=English_United Kingdom.1252
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] CONSTANd_0.99.0 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.2 purrr_0.3.4
## [6] readr_1.4.0 tidyr_1.1.2 tibble_3.0.4 tidyverse_1.3.0 psych_2.0.9
## [11] limma_3.44.3 kableExtra_1.3.1 dendextend_1.14.0 gridExtra_2.3 stringi_1.5.3
## [16] ggplot2_3.3.2 rmarkdown_2.5 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] ModelMetrics_1.2.2.2 R.methodsS3_1.8.1 knitr_1.30 multcomp_1.4-14
## [5] R.utils_2.10.1 data.table_1.13.2 rpart_4.1-15 doParallel_1.0.16
## [9] generics_0.1.0 BiocGenerics_0.34.0 preprocessCore_1.50.0 TH.data_1.0-10
## [13] webshot_0.5.2 xml2_1.3.2 lubridate_1.7.9 assertthat_0.2.1
## [17] viridis_0.5.1 gower_0.2.2 xfun_0.19 hms_0.5.3
## [21] evaluate_0.14 fansi_0.4.1 dbplyr_2.0.0 readxl_1.3.1
## [25] DBI_1.1.0 tmvnsim_1.0-2 ellipsis_0.3.1 backports_1.1.10
## [29] signal_0.7-6 libcoin_1.0-6 vctrs_0.3.4 Biobase_2.48.0
## [33] caret_6.0-86 withr_2.3.0 mnormt_2.0.2 crayon_1.3.4
## [37] recipes_0.1.15 pkgconfig_2.0.3 labeling_0.4.2 nlme_3.1-149
## [41] ProtGenerics_1.20.0 nnet_7.3-14 rlang_0.4.8 lifecycle_0.2.0
## [45] sandwich_3.0-0 affyio_1.58.0 modelr_0.1.8 cellranger_1.1.0
## [49] matrixStats_0.57.0 Matrix_1.2-18 boot_1.3-25 zoo_1.8-8
## [53] reprex_0.3.0 png_0.1-7 viridisLite_0.3.0 R.oo_1.24.0
## [57] pROC_1.16.2 S4Vectors_0.26.1 scales_1.1.1 magrittr_1.5
## [61] plyr_1.8.6 zlibbioc_1.34.0 compiler_4.0.3 pcaMethods_1.80.0
## [65] cli_2.1.0 affy_1.66.0 MASS_7.3-53 mgcv_1.8-33
## [69] tidyselect_1.1.0 vsn_3.56.0 highr_0.8 yaml_2.2.1
## [73] MALDIquant_1.19.3 grid_4.0.3 tools_4.0.3 rstudioapi_0.12
## [77] foreach_1.5.1 prodlim_2019.11.13 farver_2.0.3 mzID_1.26.0
## [81] digest_0.6.26 BiocManager_1.30.10 lava_1.6.8.1 Rcpp_1.0.5
## [85] broom_0.7.2 ncdf4_1.17 httr_1.4.2 colorspace_1.4-1
## [89] rvest_0.3.6 XML_3.99-0.5 fs_1.5.0 IRanges_2.22.2
## [93] splines_4.0.3 statmod_1.4.35 jsonlite_1.7.1 nloptr_1.2.2.2
## [97] timeDate_3043.102 modeltools_0.2-23 ipred_0.9-9 R6_2.5.0
## [101] pillar_1.4.6 htmltools_0.5.0 glue_1.4.2 minqa_1.2.4
## [105] BiocParallel_1.22.0 class_7.3-17 codetools_0.2-16 mvtnorm_1.1-1
## [109] utf8_1.1.4 lattice_0.20-41 numDeriv_2016.8-1.1 survival_3.2-7
## [113] admisc_0.11 munsell_0.5.0 e1071_1.7-4 iterators_1.0.13
## [117] impute_1.62.0 haven_2.3.1 reshape2_1.4.4 gtable_0.3.0